Pima Indians diabetes dataset

Las variables son las siguientes:

  1. npreg: número de embarazos (variable discreta, entero)
  2. glu: nivel de glucosa (variable discreta, entero)
  3. bp: presión sanguínea (variable discreta)
  4. skin: grosor de la piel (en mm, variable continua)
  5. bmi: índice de masa corporal (variable continua)
  6. ped: función de pedigree de diabetes (variable continua)
  7. age: edad (variable discreta)
  8. type: factor booleano que indica si el individuo tiene diabetes o no
data(Pima.tr2, package="MASS")

class(Pima.tr2)
## [1] "data.frame"
str(Pima.tr2)
## 'data.frame':    300 obs. of  8 variables:
##  $ npreg: int  5 7 5 0 0 5 3 1 3 2 ...
##  $ glu  : int  86 195 77 165 107 97 83 193 142 128 ...
##  $ bp   : int  68 70 82 76 60 76 58 50 80 78 ...
##  $ skin : int  28 33 41 43 25 27 31 16 15 37 ...
##  $ bmi  : num  30.2 25.1 35.8 47.9 26.4 35.6 34.3 25.9 32.4 43.3 ...
##  $ ped  : num  0.364 0.163 0.156 0.259 0.133 ...
##  $ age  : int  24 55 35 26 23 52 25 24 63 31 ...
##  $ type : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 1 1 2 ...
glimpse(Pima.tr2)
## Observations: 300
## Variables: 8
## $ npreg <int> 5, 7, 5, 0, 0, 5, 3, 1, 3, 2, 0, 9, 1, 12, 1, 4, 1, 11, ...
## $ glu   <int> 86, 195, 77, 165, 107, 97, 83, 193, 142, 128, 137, 154, ...
## $ bp    <int> 68, 70, 82, 76, 60, 76, 58, 50, 80, 78, 40, 78, 60, 62, ...
## $ skin  <int> 28, 33, 41, 43, 25, 27, 31, 16, 15, 37, 35, 30, 23, 7, 5...
## $ bmi   <dbl> 30.2, 25.1, 35.8, 47.9, 26.4, 35.6, 34.3, 25.9, 32.4, 43...
## $ ped   <dbl> 0.364, 0.163, 0.156, 0.259, 0.133, 0.378, 0.336, 0.655, ...
## $ age   <int> 24, 55, 35, 26, 23, 52, 25, 24, 63, 31, 33, 45, 59, 44, ...
## $ type  <fct> No, Yes, No, No, No, Yes, No, No, No, Yes, Yes, No, Yes,...
summary(Pima.tr2)
##      npreg             glu              bp              skin      
##  Min.   : 0.000   Min.   : 56.0   Min.   : 38.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.:101.0   1st Qu.: 64.00   1st Qu.:21.00  
##  Median : 3.000   Median :121.0   Median : 72.00   Median :29.00  
##  Mean   : 3.787   Mean   :123.7   Mean   : 72.32   Mean   :29.15  
##  3rd Qu.: 6.000   3rd Qu.:142.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :14.000   Max.   :199.0   Max.   :114.00   Max.   :99.00  
##                                   NA's   :13       NA's   :98     
##       bmi             ped              age        type    
##  Min.   :18.20   Min.   :0.0780   Min.   :21.0   No :194  
##  1st Qu.:27.10   1st Qu.:0.2367   1st Qu.:24.0   Yes:106  
##  Median :32.00   Median :0.3360   Median :29.0            
##  Mean   :32.05   Mean   :0.4357   Mean   :33.1            
##  3rd Qu.:36.50   3rd Qu.:0.5867   3rd Qu.:40.0            
##  Max.   :52.90   Max.   :2.2880   Max.   :72.0            
##  NA's   :3
# One histogram (default binning) for each of six Pima.tr2 columns.
h1 <- ggplot(data = Pima.tr2, mapping = aes(x = glu)) + geom_histogram()
h2 <- ggplot(data = Pima.tr2, mapping = aes(x = bp)) + geom_histogram()
h3 <- ggplot(data = Pima.tr2, mapping = aes(x = skin)) + geom_histogram()
h4 <- ggplot(data = Pima.tr2, mapping = aes(x = bmi)) + geom_histogram()
h5 <- ggplot(data = Pima.tr2, mapping = aes(x = ped)) + geom_histogram()
h6 <- ggplot(data = Pima.tr2, mapping = aes(x = age)) + geom_histogram()
# Arrange the six panels on a grid with two rows.
grid.arrange(h1, h2, h3, h4, h5, h6, nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 98 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Keep the columns glu through age. The call is namespace-qualified because
# MASS also exports a select() that masks dplyr's version once MASS is
# attached, which would make a bare select() fail here.
PimaV <- dplyr::select(Pima.tr2, glu:age)
# Set the plot margins (bottom, left, top, right).
par(mar = c(3.1, 4.1, 1.1, 2.1))
# scale() standardises every column so all boxplots share one axis;
# outliers are drawn with plotting symbol 16 in red.
boxplot(scale(PimaV), pch = 16, outcol = "red")

# Pairs plot of the PimaV columns: the diagonal panels use the "densityDiag"
# option and axis labels are shown.
ggpairs(
  data = PimaV,
  axisLabels = "show",
  diag = list(continuous = "densityDiag")
)
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 13 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 98 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 3 rows containing missing values
## Warning: Removed 13 rows containing missing values (geom_point).
## Warning: Removed 13 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 99 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 16 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 13 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 13 rows containing missing values
## Warning: Removed 98 rows containing missing values (geom_point).
## Warning: Removed 99 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 99 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 98 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 98 rows containing missing values
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 99 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 3 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 3 rows containing missing values
## Warning: Removed 13 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 13 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).

La primera gráfica que se muestra es simplemente para ver las distribuciones de las variables y su sesgo.
La segunda gráfica es para ver la injerencia que cada una de las variables tiene sobre la variable de respuesta “type”.

En la segunda gráfica se puede observar que, por sí solas, las variables no influyen directamente en el resultado del paciente en cuanto a diabetes; todas las medianas se encuentran muy cercanas a cero.

La última gráfica sirve para ver la relación entre las variables independientes (en la diagonal superior su correlación, en la diagonal su distribución y en la diagonal inferior cómo se comportan los datos).

Estas gráficas no sirven para dar información sobre el efecto que las variables independientes tienen sobre la variable objetivo.

Die Linke voters

library(GDAdata)

data(btw2009, package = "flexclust")

class(btw2009)
## [1] "data.frame"
str(btw2009)
## 'data.frame':    299 obs. of  17 variables:
##  $ state   : Factor w/ 16 levels "Baden-Wuerttemberg",..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ eligible: int  225216 187047 177604 198910 200347 175132 232899 240010 179987 237144 ...
##  $ votes   : int  163329 134671 128320 150018 145483 127971 176188 180295 131917 181568 ...
##  $ invalid1: int  4117 3382 3348 3404 3442 3292 3384 3674 4043 3884 ...
##  $ invalid2: int  4181 3218 3030 3026 2263 3031 3193 3303 3094 3259 ...
##  $ valid1  : int  159212 131289 124972 146614 142041 124679 172804 176621 127874 177684 ...
##  $ valid2  : int  159148 131453 125290 146992 143220 124940 172995 176992 128823 178309 ...
##  $ SPD1    : int  52139 36642 37277 47610 54398 41683 54950 54065 43765 53073 ...
##  $ SPD2    : int  41793 32239 31282 39253 42369 35557 45540 44445 36329 45354 ...
##  $ UNION1  : int  61793 56751 49779 58876 42733 48136 70458 70290 49363 70874 ...
##  $ UNION2  : int  51068 47947 43480 49789 36397 40110 57203 58225 42193 59075 ...
##  $ GRUENE1 : int  16399 12840 10665 14790 18699 12187 16143 16633 10411 18761 ...
##  $ GRUENE2 : int  21967 14927 12899 18751 24659 15701 20978 21245 13651 22240 ...
##  $ FDP1    : int  15292 15184 16037 14317 12188 12515 17018 20084 14397 19742 ...
##  $ FDP2    : int  24187 22605 21970 23442 19156 19772 28640 31407 22701 30714 ...
##  $ LINKE1  : int  11918 8208 8896 8729 11817 7871 11794 12527 8235 12351 ...
##  $ LINKE2  : int  13481 8836 9808 9821 13430 8936 13412 14280 9342 13855 ...
glimpse(btw2009)
## Observations: 299
## Variables: 17
## $ state    <fct> Schleswig-Holstein, Schleswig-Holstein, Schleswig-Hol...
## $ eligible <int> 225216, 187047, 177604, 198910, 200347, 175132, 23289...
## $ votes    <int> 163329, 134671, 128320, 150018, 145483, 127971, 17618...
## $ invalid1 <int> 4117, 3382, 3348, 3404, 3442, 3292, 3384, 3674, 4043,...
## $ invalid2 <int> 4181, 3218, 3030, 3026, 2263, 3031, 3193, 3303, 3094,...
## $ valid1   <int> 159212, 131289, 124972, 146614, 142041, 124679, 17280...
## $ valid2   <int> 159148, 131453, 125290, 146992, 143220, 124940, 17299...
## $ SPD1     <int> 52139, 36642, 37277, 47610, 54398, 41683, 54950, 5406...
## $ SPD2     <int> 41793, 32239, 31282, 39253, 42369, 35557, 45540, 4444...
## $ UNION1   <int> 61793, 56751, 49779, 58876, 42733, 48136, 70458, 7029...
## $ UNION2   <int> 51068, 47947, 43480, 49789, 36397, 40110, 57203, 5822...
## $ GRUENE1  <int> 16399, 12840, 10665, 14790, 18699, 12187, 16143, 1663...
## $ GRUENE2  <int> 21967, 14927, 12899, 18751, 24659, 15701, 20978, 2124...
## $ FDP1     <int> 15292, 15184, 16037, 14317, 12188, 12515, 17018, 2008...
## $ FDP2     <int> 24187, 22605, 21970, 23442, 19156, 19772, 28640, 3140...
## $ LINKE1   <int> 11918, 8208, 8896, 8729, 11817, 7871, 11794, 12527, 8...
## $ LINKE2   <int> 13481, 8836, 9808, 9821, 13430, 8936, 13412, 14280, 9...
summary(btw2009)
##                  state       eligible          votes           invalid1   
##  Nordrhein-Westfalen:64   Min.   :154767   Min.   : 94922   Min.   :1198  
##  Bayern             :45   1st Qu.:190016   1st Qu.:133038   1st Qu.:1888  
##  Baden-Wuerttemberg :38   Median :207451   Median :146160   Median :2374  
##  Niedersachsen      :30   Mean   :207921   Mean   :147176   Mean   :2534  
##  Hessen             :21   3rd Qu.:224707   3rd Qu.:160939   3rd Qu.:3098  
##  Sachsen            :16   Max.   :256131   Max.   :191097   Max.   :5598  
##  (Other)            :85                                                   
##     invalid2        valid1           valid2            SPD1      
##  Min.   : 974   Min.   : 93398   Min.   : 93873   Min.   :14040  
##  1st Qu.:1552   1st Qu.:130464   1st Qu.:130780   1st Qu.:28807  
##  Median :1971   Median :143231   Median :143546   Median :40464  
##  Mean   :2122   Mean   :144642   Mean   :145054   Mean   :40401  
##  3rd Qu.:2564   3rd Qu.:158170   3rd Qu.:158481   3rd Qu.:50988  
##  Max.   :4959   Max.   :188127   Max.   :188728   Max.   :73215  
##                                                                  
##       SPD2           UNION1           UNION2         GRUENE1     
##  Min.   :13611   Min.   : 18394   Min.   :18788   Min.   : 3725  
##  1st Qu.:25481   1st Qu.: 46464   1st Qu.:40868   1st Qu.: 9008  
##  Median :32691   Median : 56503   Median :49267   Median :12190  
##  Mean   :33413   Mean   : 57016   Mean   :49025   Mean   :13436  
##  3rd Qu.:40946   3rd Qu.: 66090   3rd Qu.:55608   3rd Qu.:16142  
##  Max.   :59493   Max.   :101261   Max.   :86686   Max.   :73897  
##                                                   NA's   :3      
##     GRUENE2           FDP1            FDP2           LINKE1     
##  Min.   : 4238   Min.   : 4885   Min.   : 7872   Min.   : 4868  
##  1st Qu.:10906   1st Qu.:10665   1st Qu.:16805   1st Qu.: 9066  
##  Median :14609   Median :12982   Median :20829   Median :10936  
##  Mean   :15529   Mean   :13634   Mean   :21124   Mean   :16132  
##  3rd Qu.:18894   3rd Qu.:15970   3rd Qu.:25014   3rd Qu.:14083  
##  Max.   :43347   Max.   :31606   Max.   :38125   Max.   :62880  
##                                                  NA's   :2      
##      LINKE2     
##  Min.   : 5641  
##  1st Qu.:10406  
##  Median :12596  
##  Mean   :17244  
##  3rd Qu.:16266  
##  Max.   :56938  
## 
# Add Linke2: LINKE2 expressed as a percentage of valid2.
btw2009$Linke2 <- 100 * btw2009$LINKE2 / btw2009$valid2

# Histogram of that percentage using bins of width 1.
ggplot(data = btw2009, mapping = aes(x = Linke2)) +
  geom_histogram(binwidth = 1, fill = "mediumpurple") +
  labs(x = "Percentage voter support for Die Linke in 2009", y = "")

Galton

library(UsingR)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
## 
##     cancer
data(galton, package="UsingR")

class(galton)
## [1] "data.frame"
str(galton)
## 'data.frame':    928 obs. of  2 variables:
##  $ child : num  61.7 61.7 61.7 61.7 61.7 62.2 62.2 62.2 62.2 62.2 ...
##  $ parent: num  70.5 68.5 65.5 64.5 64 67.5 67.5 67.5 66.5 66.5 ...
glimpse(galton)
## Observations: 928
## Variables: 2
## $ child  <dbl> 61.7, 61.7, 61.7, 61.7, 61.7, 62.2, 62.2, 62.2, 62.2, 6...
## $ parent <dbl> 70.5, 68.5, 65.5, 64.5, 64.0, 67.5, 67.5, 67.5, 66.5, 6...
summary(galton)
##      child           parent     
##  Min.   :61.70   Min.   :64.00  
##  1st Qu.:66.20   1st Qu.:67.50  
##  Median :68.20   Median :68.50  
##  Mean   :68.09   Mean   :68.31  
##  3rd Qu.:70.20   3rd Qu.:69.50  
##  Max.   :73.70   Max.   :73.00
# x-axis label shared by the two base-graphics histograms.
ht <- "height (in)"

# Two panels side by side, horizontal axis labels, custom margins.
par(mfrow = c(1, 2), las = 1, mar = c(3.1, 4.1, 1.1, 2.1))

with(galton, hist(child, xlab = ht, main = "Children", col = "green"))
with(galton, hist(parent, xlab = ht, main = "Parents", col = "blue"))

# Same two variables drawn with truehist() and h = 0.1. The calls stay inside
# with() so that truehist() sees the bare column names.
par(mfrow = c(1, 2), las = 1, mar = c(3.1, 4.1, 1.1, 2.1))
with(galton, MASS::truehist(child, h = 0.1))
with(galton, MASS::truehist(parent, h = 0.1))

# Histogram of child heights (bin width 1) on fixed axes, with a red vertical
# line at the median.
c1 <- ggplot(data = galton, mapping = aes(x = child)) +
  geom_histogram(binwidth = 1) +
  geom_vline(xintercept = median(galton$child), colour = "red") +
  xlim(60, 75) +
  ylim(0, 225) +
  ylab("")

# Same plot for parent heights, using identical axis limits.
p1 <- ggplot(data = galton, mapping = aes(x = parent)) +
  geom_histogram(binwidth = 1) +
  geom_vline(xintercept = median(galton$parent), colour = "red") +
  xlim(60, 75) +
  ylim(0, 225) +
  ylab("")

grid.arrange(c1, p1)

Altura Pearson

data(father.son, package="UsingR")

class(father.son)
## [1] "data.frame"
str(father.son)
## 'data.frame':    1078 obs. of  2 variables:
##  $ fheight: num  65 63.3 65 65.8 61.1 ...
##  $ sheight: num  59.8 63.2 63.3 62.8 64.3 ...
glimpse(father.son)
## Observations: 1,078
## Variables: 2
## $ fheight <dbl> 65.04851, 63.25094, 64.95532, 65.75250, 61.13723, 63.0...
## $ sheight <dbl> 59.77827, 63.21404, 63.34242, 62.79238, 64.28113, 64.2...
summary(father.son)
##     fheight         sheight     
##  Min.   :59.01   Min.   :58.51  
##  1st Qu.:65.79   1st Qu.:66.93  
##  Median :67.77   Median :68.62  
##  Mean   :67.69   Mean   :68.68  
##  3rd Qu.:69.60   3rd Qu.:70.47  
##  Max.   :75.43   Max.   :78.36
# Density-scaled histogram of son heights (bin width 1) with a density curve
# drawn on top; axis limits are fixed so both panels are comparable.
c2 <- ggplot(data = father.son, mapping = aes(x = sheight)) +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 1) +
  geom_density() +
  xlim(58, 80) +
  ylim(0, 0.16) +
  labs(x = "ht (inches)", y = "", title = "Sons")

# Same plot for father heights.
p2 <- ggplot(data = father.son, mapping = aes(x = fheight)) +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 1) +
  geom_density() +
  xlim(58, 80) +
  ylim(0, 0.16) +
  labs(x = "ht (inches)", y = "", title = "Fathers")

# Show the two panels in a single row.
grid.arrange(c2, p2, nrow = 1)

# Normal Q-Q plots for sons and fathers, side by side on the same y range,
# each with its reference line.
par(mfrow = c(1, 2), las = 1, mar = c(3.1, 4.1, 1.1, 2.1))

qqnorm(father.son$sheight, main = "Sons", xlab = "", ylab = "",
       pch = 16, ylim = c(55, 80))
qqline(father.son$sheight)

qqnorm(father.son$fheight, main = "Fathers", xlab = "", ylab = "",
       pch = 16, ylim = c(55, 80))
qqline(father.son$fheight)

shapiro.test(father.son$fheight)
## 
##  Shapiro-Wilk normality test
## 
## data:  father.son$fheight
## W = 0.99791, p-value = 0.1962
shapiro.test(father.son$sheight)
## 
##  Shapiro-Wilk normality test
## 
## data:  father.son$sheight
## W = 0.99642, p-value = 0.0142

Hill races

class(MASS::hills)
## [1] "data.frame"
str(MASS::hills)
## 'data.frame':    35 obs. of  3 variables:
##  $ dist : num  2.5 6 6 7.5 8 8 16 6 5 6 ...
##  $ climb: int  650 2500 900 800 3070 2866 7500 800 800 650 ...
##  $ time : num  16.1 48.4 33.6 45.6 62.3 ...
glimpse(MASS::hills)
## Observations: 35
## Variables: 3
## $ dist  <dbl> 2.5, 6.0, 6.0, 7.5, 8.0, 8.0, 16.0, 6.0, 5.0, 6.0, 28.0,...
## $ climb <int> 650, 2500, 900, 800, 3070, 2866, 7500, 800, 800, 650, 21...
## $ time  <dbl> 16.083, 48.350, 33.650, 45.600, 62.267, 73.217, 204.617,...
summary(MASS::hills)
##       dist            climb           time       
##  Min.   : 2.000   Min.   : 300   Min.   : 15.95  
##  1st Qu.: 4.500   1st Qu.: 725   1st Qu.: 28.00  
##  Median : 6.000   Median :1000   Median : 39.75  
##  Mean   : 7.529   Mean   :1815   Mean   : 57.88  
##  3rd Qu.: 8.000   3rd Qu.:2200   3rd Qu.: 68.62  
##  Max.   :28.000   Max.   :7500   Max.   :204.62
# Single panel with custom margins.
par(mfrow = c(1, 1), mar = c(3.1, 4.1, 1.1, 2.1))

# Horizontal boxplot of the time column.
with(MASS::hills, boxplot(time, horizontal = TRUE, pch = 16, ylim = c(0, 220)))

# The same column drawn with truehist() and with hist(), default settings.
with(MASS::hills, MASS::truehist(time))

with(MASS::hills, hist(time))

# ggplot2 histogram with bins of width 10.
ggplot(data = MASS::hills, mapping = aes(x = time)) +
  geom_histogram(binwidth = 10)

# Boston Housing

class(MASS::Boston)
## [1] "data.frame"
str(MASS::Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
glimpse(MASS::Boston)
## Observations: 506
## Variables: 14
## $ crim    <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, ...
## $ zn      <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5,...
## $ indus   <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, ...
## $ chas    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ nox     <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524...
## $ rm      <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172...
## $ age     <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0,...
## $ dis     <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605...
## $ rad     <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, ...
## $ tax     <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311,...
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, ...
## $ black   <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60...
## $ lstat   <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.9...
## $ medv    <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, ...
summary(MASS::Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
# Histogram of medv with the default binning.
ggplot(data = MASS::Boston, mapping = aes(x = medv)) +
  geom_histogram() +
  labs(x = "Median housing value (thousands of dollars)", y = "")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(MASS)

# truehist() of medv with h = 1.
with(MASS::Boston, truehist(medv, h = 1))

library(tidyr)

# Reshape Boston to long format: one row per (column, value) pair, with the
# column name in BosVars and the cell value in BosValues. pivot_longer() is
# the maintained replacement for the superseded gather(); the result is a
# tibble whose rows are ordered by original row rather than by variable,
# which does not affect the faceted histograms below.
B2 <- pivot_longer(MASS::Boston, cols = crim:medv,
                   names_to = "BosVars", values_to = "BosValues")

# One histogram per original column; every facet gets its own axis ranges.
ggplot(B2, aes(BosValues)) + geom_histogram() + xlab("") +
    ylab("") + facet_wrap(~ BosVars, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(MASS)

# ptratio drawn with hist() and with truehist(), default settings.
with(Boston, hist(ptratio))

with(Boston, truehist(ptratio))

# medv shown as a boxplot, a jittered strip chart and a stem-and-leaf display.
with(Boston, boxplot(medv, pch = 16))

with(Boston, stripchart(medv, method = "jitter", pch = 16))

with(Boston, stem(medv))
## 
##   The decimal point is at the |
## 
##    4 | 006
##    6 | 30022245
##    8 | 1334455788567
##   10 | 2224455899035778899
##   12 | 013567778011112333444455668888899
##   14 | 0111233445556689990001222344666667
##   16 | 01112234556677880111222344455567888889
##   18 | 01222334445555667778899990011112233333444444555566666778889999
##   20 | 0000011111223333444455566666677888990001122222444445566777777788999
##   22 | 00000001222223344555666667788889999000011111112222333344566777788889
##   24 | 001112333444455566777888800000000123
##   26 | 24456667011555599
##   28 | 01244567770011466889
##   30 | 111357801255667
##   32 | 0024579011223448
##   34 | 679991244
##   36 | 01224502369
##   38 | 78
##   40 | 37
##   42 | 38158
##   44 | 084
##   46 | 07
##   48 | 358
##   50 | 0000000000000000
library(ash)
# bin1() bins medv into 50 bins, ash1() computes its estimate from those bin
# counts, and the result is drawn as a line (type = "l").
plot(ash1(bin1(Boston$medv, nbin=50)), type="l")
## [1] "ash estimate nonzero outside interval ab"

# Kernel density estimate of medv with the default bandwidth.
d1 <- density(Boston$medv)
plot(d1, ylim = c(0, 0.08))
# Mark the individual observations along the x-axis.
rug(Boston$medv)
# Overlay estimates using one half and one fifth of d1's bandwidth.
lines(density(Boston$medv, bw = d1$bw / 2), col = "green")
lines(density(Boston$medv, bw = d1$bw / 5), col = "blue")

Votaciones

# Reload the data set, which replaces the current btw2009 object.
data(btw2009, package = "flexclust")

# Copy the state factor into a new column named Bundesland.
btw2009$Bundesland <- btw2009$state
levels(btw2009$state)
##  [1] "Baden-Wuerttemberg"     "Bayern"                
##  [3] "Berlin"                 "Brandenburg"           
##  [5] "Bremen"                 "Hamburg"               
##  [7] "Hessen"                 "Mecklenburg-Vorpommern"
##  [9] "Niedersachsen"          "Nordrhein-Westfalen"   
## [11] "Rheinland-Pfalz"        "Saarland"              
## [13] "Sachsen"                "Sachsen-Anhalt"        
## [15] "Schleswig-Holstein"     "Thueringen"
class(btw2009)
## [1] "data.frame"
str(btw2009)
## 'data.frame':    299 obs. of  18 variables:
##  $ state     : Factor w/ 16 levels "Baden-Wuerttemberg",..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ eligible  : int  225216 187047 177604 198910 200347 175132 232899 240010 179987 237144 ...
##  $ votes     : int  163329 134671 128320 150018 145483 127971 176188 180295 131917 181568 ...
##  $ invalid1  : int  4117 3382 3348 3404 3442 3292 3384 3674 4043 3884 ...
##  $ invalid2  : int  4181 3218 3030 3026 2263 3031 3193 3303 3094 3259 ...
##  $ valid1    : int  159212 131289 124972 146614 142041 124679 172804 176621 127874 177684 ...
##  $ valid2    : int  159148 131453 125290 146992 143220 124940 172995 176992 128823 178309 ...
##  $ SPD1      : int  52139 36642 37277 47610 54398 41683 54950 54065 43765 53073 ...
##  $ SPD2      : int  41793 32239 31282 39253 42369 35557 45540 44445 36329 45354 ...
##  $ UNION1    : int  61793 56751 49779 58876 42733 48136 70458 70290 49363 70874 ...
##  $ UNION2    : int  51068 47947 43480 49789 36397 40110 57203 58225 42193 59075 ...
##  $ GRUENE1   : int  16399 12840 10665 14790 18699 12187 16143 16633 10411 18761 ...
##  $ GRUENE2   : int  21967 14927 12899 18751 24659 15701 20978 21245 13651 22240 ...
##  $ FDP1      : int  15292 15184 16037 14317 12188 12515 17018 20084 14397 19742 ...
##  $ FDP2      : int  24187 22605 21970 23442 19156 19772 28640 31407 22701 30714 ...
##  $ LINKE1    : int  11918 8208 8896 8729 11817 7871 11794 12527 8235 12351 ...
##  $ LINKE2    : int  13481 8836 9808 9821 13430 8936 13412 14280 9342 13855 ...
##  $ Bundesland: Factor w/ 16 levels "Baden-Wuerttemberg",..: 15 15 15 15 15 15 15 15 15 15 ...
glimpse(btw2009)
## Observations: 299
## Variables: 18
## $ state      <fct> Schleswig-Holstein, Schleswig-Holstein, Schleswig-H...
## $ eligible   <int> 225216, 187047, 177604, 198910, 200347, 175132, 232...
## $ votes      <int> 163329, 134671, 128320, 150018, 145483, 127971, 176...
## $ invalid1   <int> 4117, 3382, 3348, 3404, 3442, 3292, 3384, 3674, 404...
## $ invalid2   <int> 4181, 3218, 3030, 3026, 2263, 3031, 3193, 3303, 309...
## $ valid1     <int> 159212, 131289, 124972, 146614, 142041, 124679, 172...
## $ valid2     <int> 159148, 131453, 125290, 146992, 143220, 124940, 172...
## $ SPD1       <int> 52139, 36642, 37277, 47610, 54398, 41683, 54950, 54...
## $ SPD2       <int> 41793, 32239, 31282, 39253, 42369, 35557, 45540, 44...
## $ UNION1     <int> 61793, 56751, 49779, 58876, 42733, 48136, 70458, 70...
## $ UNION2     <int> 51068, 47947, 43480, 49789, 36397, 40110, 57203, 58...
## $ GRUENE1    <int> 16399, 12840, 10665, 14790, 18699, 12187, 16143, 16...
## $ GRUENE2    <int> 21967, 14927, 12899, 18751, 24659, 15701, 20978, 21...
## $ FDP1       <int> 15292, 15184, 16037, 14317, 12188, 12515, 17018, 20...
## $ FDP2       <int> 24187, 22605, 21970, 23442, 19156, 19772, 28640, 31...
## $ LINKE1     <int> 11918, 8208, 8896, 8729, 11817, 7871, 11794, 12527,...
## $ LINKE2     <int> 13481, 8836, 9808, 9821, 13430, 8936, 13412, 14280,...
## $ Bundesland <fct> Schleswig-Holstein, Schleswig-Holstein, Schleswig-H...
summary(btw2009)
##                  state       eligible          votes           invalid1   
##  Nordrhein-Westfalen:64   Min.   :154767   Min.   : 94922   Min.   :1198  
##  Bayern             :45   1st Qu.:190016   1st Qu.:133038   1st Qu.:1888  
##  Baden-Wuerttemberg :38   Median :207451   Median :146160   Median :2374  
##  Niedersachsen      :30   Mean   :207921   Mean   :147176   Mean   :2534  
##  Hessen             :21   3rd Qu.:224707   3rd Qu.:160939   3rd Qu.:3098  
##  Sachsen            :16   Max.   :256131   Max.   :191097   Max.   :5598  
##  (Other)            :85                                                   
##     invalid2        valid1           valid2            SPD1      
##  Min.   : 974   Min.   : 93398   Min.   : 93873   Min.   :14040  
##  1st Qu.:1552   1st Qu.:130464   1st Qu.:130780   1st Qu.:28807  
##  Median :1971   Median :143231   Median :143546   Median :40464  
##  Mean   :2122   Mean   :144642   Mean   :145054   Mean   :40401  
##  3rd Qu.:2564   3rd Qu.:158170   3rd Qu.:158481   3rd Qu.:50988  
##  Max.   :4959   Max.   :188127   Max.   :188728   Max.   :73215  
##                                                                  
##       SPD2           UNION1           UNION2         GRUENE1     
##  Min.   :13611   Min.   : 18394   Min.   :18788   Min.   : 3725  
##  1st Qu.:25481   1st Qu.: 46464   1st Qu.:40868   1st Qu.: 9008  
##  Median :32691   Median : 56503   Median :49267   Median :12190  
##  Mean   :33413   Mean   : 57016   Mean   :49025   Mean   :13436  
##  3rd Qu.:40946   3rd Qu.: 66090   3rd Qu.:55608   3rd Qu.:16142  
##  Max.   :59493   Max.   :101261   Max.   :86686   Max.   :73897  
##                                                   NA's   :3      
##     GRUENE2           FDP1            FDP2           LINKE1     
##  Min.   : 4238   Min.   : 4885   Min.   : 7872   Min.   : 4868  
##  1st Qu.:10906   1st Qu.:10665   1st Qu.:16805   1st Qu.: 9066  
##  Median :14609   Median :12982   Median :20829   Median :10936  
##  Mean   :15529   Mean   :13634   Mean   :21124   Mean   :16132  
##  3rd Qu.:18894   3rd Qu.:15970   3rd Qu.:25014   3rd Qu.:14083  
##  Max.   :43347   Max.   :31606   Max.   :38125   Max.   :62880  
##                                                  NA's   :2      
##      LINKE2                    Bundesland
##  Min.   : 5641   Nordrhein-Westfalen:64  
##  1st Qu.:10406   Bayern             :45  
##  Median :12596   Baden-Wuerttemberg :38  
##  Mean   :17244   Niedersachsen      :30  
##  3rd Qu.:16266   Hessen             :21  
##  Max.   :56938   Sachsen            :16  
##                  (Other)            :85
# stateA is a copy of state whose levels are replaced by two-letter codes.
# The codes are listed in the same order as levels(btw2009$state).
btw2009$stateA <- btw2009$state
levels(btw2009$stateA) <- c(
  "BW", "BY", "BE", "BB", "HB", "HH", "HE", "MV",
  "NI", "NW", "RP", "SL", "SN", "ST", "SH", "TH"
)


# Boxplots of LINKE2 by state; box widths vary with group size
# (varwidth = TRUE) and the x-axis labels are rotated 90 degrees.
ggplot(data = btw2009, mapping = aes(x = state, y = LINKE2)) +
  geom_boxplot(varwidth = TRUE) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("")

# Sum of the eligible column within each stateA level. The tapply() result
# is returned directly from with(); no intermediate variable is needed
# inside the call.
Voters <- with(btw2009, tapply(eligible, stateA, sum))

Voters
##       BW       BY       BE       BB       HB       HH       HE       MV 
##  7633818  9382583  2471665  2128715   487978  1256634  4398919  1400298 
##       NI       NW       RP       SL       SN       ST       SH       TH 
##  6112110 13288291  3103878   808554  3518195  2028572  2234720  1913559
# State codes taken from the names of the Voters totals.
Bundesland <- rownames(Voters)

# One row per state; EW is "East" for the six listed codes (matched against
# the row names) and "West" for every other row.
btw9s <- data.frame(Bundesland, Voters)
btw9s$EW <- ifelse(
  rownames(btw9s) %in% c("BB", "BE", "MV", "SN", "ST", "TH"),
  "East",
  "West"
)

ls <- with(btw9s, Bundesland[order(EW, -Voters)])
ls
##  [1] SN BE BB ST TH MV NW BY BW NI HE RP SH HH SL HB
## Levels: BB BE BW BY HB HE HH MV NI NW RP SH SL SN ST TH
btw9s <- within(btw9s, State1 <- factor(Bundesland, levels=ls))
    
# Bar chart of eligible voters (in millions) with states in their default
# order.
b1 <- ggplot(btw9s, aes(Bundesland, Voters/1000000)) +
  geom_bar(stat="identity") +
  ylab("Voters (millions)")

# Same bars sorted by decreasing number of voters. The y-axis label is
# spelled out in full so that all three panels are labelled consistently.
b2 <- ggplot(btw9s, aes(reorder(Bundesland, -Voters),Voters/1000000)) + 
  geom_bar(stat="identity") +
  xlab("Bundesland") +
  ylab("Voters (millions)")

# Same bars in the order given by the levels of State1.
b3 <- ggplot(btw9s, aes(State1, Voters/1000000)) +
  geom_bar(stat="identity") + 
  xlab("Bundesland") +
  ylab("Voters (millions)")

# Stack the three panels vertically.
grid.arrange(b1, b2, b3)

Anorexia

# Anorexia

data(anorexia, package="MASS")

class(anorexia)
## [1] "data.frame"
str(anorexia)
## 'data.frame':    72 obs. of  3 variables:
##  $ Treat : Factor w/ 3 levels "CBT","Cont","FT": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Prewt : num  80.7 89.4 91.8 74 78.1 88.3 87.3 75.1 80.6 78.4 ...
##  $ Postwt: num  80.2 80.1 86.4 86.3 76.1 78.1 75.1 86.7 73.5 84.6 ...
glimpse(anorexia)
## Observations: 72
## Variables: 3
## $ Treat  <fct> Cont, Cont, Cont, Cont, Cont, Cont, Cont, Cont, Cont, C...
## $ Prewt  <dbl> 80.7, 89.4, 91.8, 74.0, 78.1, 88.3, 87.3, 75.1, 80.6, 7...
## $ Postwt <dbl> 80.2, 80.1, 86.4, 86.3, 76.1, 78.1, 75.1, 86.7, 73.5, 8...
summary(anorexia)
##   Treat        Prewt           Postwt      
##  CBT :29   Min.   :70.00   Min.   : 71.30  
##  Cont:26   1st Qu.:79.60   1st Qu.: 79.33  
##  FT  :17   Median :82.30   Median : 84.05  
##            Mean   :82.41   Mean   : 85.17  
##            3rd Qu.:86.00   3rd Qu.: 91.55  
##            Max.   :94.90   Max.   :103.60
ggplot(anorexia, aes(Treat)) + geom_bar() + xlab("Treatment")

with(anorexia, table(Treat))
## Treat
##  CBT Cont   FT 
##   29   26   17

Titanic

class(Titanic)
## [1] "table"
Titanic1 <- data.frame(Titanic)

class(Titanic1)
## [1] "data.frame"
str(Titanic1)
## 'data.frame':    32 obs. of  5 variables:
##  $ Class   : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Sex     : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 1 ...
##  $ Age     : Factor w/ 2 levels "Child","Adult": 1 1 1 1 1 1 1 1 2 2 ...
##  $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq    : num  0 0 35 0 0 0 17 0 118 154 ...
glimpse(Titanic1)
## Observations: 32
## Variables: 5
## $ Class    <fct> 1st, 2nd, 3rd, Crew, 1st, 2nd, 3rd, Crew, 1st, 2nd, 3...
## $ Sex      <fct> Male, Male, Male, Male, Female, Female, Female, Femal...
## $ Age      <fct> Child, Child, Child, Child, Child, Child, Child, Chil...
## $ Survived <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N...
## $ Freq     <dbl> 0, 0, 35, 0, 0, 0, 17, 0, 118, 154, 387, 670, 4, 13, ...
summary(Titanic1)
##   Class       Sex        Age     Survived      Freq       
##  1st :8   Male  :16   Child:16   No :16   Min.   :  0.00  
##  2nd :8   Female:16   Adult:16   Yes:16   1st Qu.:  0.75  
##  3rd :8                                   Median : 13.50  
##  Crew:8                                   Mean   : 68.78  
##                                           3rd Qu.: 77.00  
##                                           Max.   :670.00
# Shared base plot: every bar is weighted by Freq and all panels use the same
# y range, with no y-axis title.
p <- ggplot(data = Titanic1, mapping = aes(weight = Freq)) +
  ylim(0, 2250) +
  ylab("")

# One weighted bar chart per categorical variable.
cs <- p + aes(x = Class) + geom_bar(fill = "blue")
sx <- p + aes(x = Sex) + geom_bar(fill = "green")
ag <- p + aes(x = Age) + geom_bar(fill = "tan2")
su <- p + aes(x = Survived) + geom_bar(fill = "red")

# Single row; the first panel is given relative width 3, the others 2.
grid.arrange(cs, sx, ag, su, nrow = 1, widths = c(3, 2, 2, 2))

Polls

# Poll categories and the number of respondents in each one.
Party <- c(
  "Fine Gael", "La", "Fianna Fail", "Sinn Fein",
  "In", "Green", "Don’t know"
)
nos <- c(181, 51, 171, 119, 91, 4, 368)

IrOP <- data.frame(Party, nos)

# percnot: share of each count relative to the total without the 7th entry.
# percwith: share of each count relative to the total of all entries.
# percnot is added first so the column order is Party, nos, percnot, percwith.
IrOP$percnot <- IrOP$nos / sum(IrOP$nos[-7])
IrOP$percwith <- IrOP$nos / sum(IrOP$nos)

IrOP
##         Party nos     percnot    percwith
## 1   Fine Gael 181 0.293354943 0.183756345
## 2          La  51 0.082658023 0.051776650
## 3 Fianna Fail 171 0.277147488 0.173604061
## 4   Sinn Fein 119 0.192868720 0.120812183
## 5          In  91 0.147487844 0.092385787
## 6       Green   4 0.006482982 0.004060914
## 7  Don’t know 368 0.596434360 0.373604061
# Two stacked panels with small margins.
par(mfrow=c(2,1), mar = c(2.1, 2.1, 2.1, 2.1))

# Shares over all seven categories.
with(IrOP, pie(percwith, labels=Party, clockwise=TRUE,
               col=c("blue", "red", "darkgreen", "black",
                     "grey", "lightgreen", "white"), radius=1))

# Shares without the seventh category. The seventh entry is dropped from the
# labels as well as from the values so that both vectors have six elements.
with(IrOP, pie(percnot[-7], labels=Party[-7], clockwise=TRUE,
               col=c("blue", "red", "darkgreen", "black",
                     "grey", "lightgreen"), radius=1))

# Showing values in a pie chart

# Pie chart built from a data frame, with sample sizes appended to the labels

# Number of observations per species.
mytable <- table(iris$Species)

# Each label is the species name, a line break, and its count.
lbls <- paste0(names(mytable), "\n", mytable)

# Back to a single-panel layout, keeping the reduced margins.
par(mfrow = c(1, 1), mar = c(2.1, 2.1, 2.1, 2.1))

pie(
  mytable,
  labels = lbls,
  main = "Pie Chart of Species\n (with sample sizes)"
)

# Pie charts are not recommended; see ?pie

Encuesta a estudiantes de estadística

data(survey, package="MASS")

class(survey)
## [1] "data.frame"
str(survey)
## 'data.frame':    237 obs. of  12 variables:
##  $ Sex   : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 1 2 1 2 2 ...
##  $ Wr.Hnd: num  18.5 19.5 18 18.8 20 18 17.7 17 20 18.5 ...
##  $ NW.Hnd: num  18 20.5 13.3 18.9 20 17.7 17.7 17.3 19.5 18.5 ...
##  $ W.Hnd : Factor w/ 2 levels "Left","Right": 2 1 2 2 2 2 2 2 2 2 ...
##  $ Fold  : Factor w/ 3 levels "L on R","Neither",..: 3 3 1 3 2 1 1 3 3 3 ...
##  $ Pulse : int  92 104 87 NA 35 64 83 74 72 90 ...
##  $ Clap  : Factor w/ 3 levels "Left","Neither",..: 1 1 2 2 3 3 3 3 3 3 ...
##  $ Exer  : Factor w/ 3 levels "Freq","None",..: 3 2 2 2 3 3 1 1 3 3 ...
##  $ Smoke : Factor w/ 4 levels "Heavy","Never",..: 2 4 3 2 2 2 2 2 2 2 ...
##  $ Height: num  173 178 NA 160 165 ...
##  $ M.I   : Factor w/ 2 levels "Imperial","Metric": 2 1 NA 2 2 1 1 2 2 2 ...
##  $ Age   : num  18.2 17.6 16.9 20.3 23.7 ...
glimpse(survey)
## Observations: 237
## Variables: 12
## $ Sex    <fct> Female, Male, Male, Male, Male, Female, Male, Female, M...
## $ Wr.Hnd <dbl> 18.5, 19.5, 18.0, 18.8, 20.0, 18.0, 17.7, 17.0, 20.0, 1...
## $ NW.Hnd <dbl> 18.0, 20.5, 13.3, 18.9, 20.0, 17.7, 17.7, 17.3, 19.5, 1...
## $ W.Hnd  <fct> Right, Left, Right, Right, Right, Right, Right, Right, ...
## $ Fold   <fct> R on L, R on L, L on R, R on L, Neither, L on R, L on R...
## $ Pulse  <int> 92, 104, 87, NA, 35, 64, 83, 74, 72, 90, 80, 68, NA, 66...
## $ Clap   <fct> Left, Left, Neither, Neither, Right, Right, Right, Righ...
## $ Exer   <fct> Some, None, None, None, Some, Some, Freq, Freq, Some, S...
## $ Smoke  <fct> Never, Regul, Occas, Never, Never, Never, Never, Never,...
## $ Height <dbl> 173.00, 177.80, NA, 160.00, 165.00, 172.72, 182.88, 157...
## $ M.I    <fct> Metric, Imperial, NA, Metric, Metric, Imperial, Imperia...
## $ Age    <dbl> 18.250, 17.583, 16.917, 20.333, 23.667, 21.000, 18.833,...
summary(survey)
##      Sex          Wr.Hnd          NW.Hnd        W.Hnd          Fold    
##  Female:118   Min.   :13.00   Min.   :12.50   Left : 18   L on R : 99  
##  Male  :118   1st Qu.:17.50   1st Qu.:17.50   Right:218   Neither: 18  
##  NA's  :  1   Median :18.50   Median :18.50   NA's :  1   R on L :120  
##               Mean   :18.67   Mean   :18.58                            
##               3rd Qu.:19.80   3rd Qu.:19.73                            
##               Max.   :23.20   Max.   :23.50                            
##               NA's   :1       NA's   :1                                
##      Pulse             Clap       Exer       Smoke         Height     
##  Min.   : 35.00   Left   : 39   Freq:115   Heavy: 11   Min.   :150.0  
##  1st Qu.: 66.00   Neither: 50   None: 24   Never:189   1st Qu.:165.0  
##  Median : 72.50   Right  :147   Some: 98   Occas: 19   Median :171.0  
##  Mean   : 74.15   NA's   :  1              Regul: 17   Mean   :172.4  
##  3rd Qu.: 80.00                            NA's :  1   3rd Qu.:180.0  
##  Max.   :104.00                                        Max.   :200.0  
##  NA's   :45                                            NA's   :28     
##        M.I           Age       
##  Imperial: 68   Min.   :16.75  
##  Metric  :141   1st Qu.:17.67  
##  NA's    : 28   Median :18.58  
##                 Mean   :20.37  
##                 3rd Qu.:20.17  
##                 Max.   :73.00  
## 
# One bar chart per categorical survey variable; y-axis titles are blanked.
s1 <- ggplot(survey, aes(x = Sex)) +
  geom_bar() +
  ylab("")
s2 <- ggplot(survey, aes(x = W.Hnd)) +
  geom_bar() +
  xlab("Writing hand") +
  ylab("")
s3 <- ggplot(survey, aes(x = Fold)) +
  geom_bar() +
  xlab("Folding arms: arm on top") +
  ylab("")
s4 <- ggplot(survey, aes(x = Clap)) +
  geom_bar() +
  xlab("Clapping: hand on top") +
  ylab("")

# Re-level Exer so the bars run None, Some, Freq instead of the stored
# level order (Freq, None, Some).
survey$ExerN <- factor(survey$Exer, levels = c("None", "Some", "Freq"))
s5 <- ggplot(survey, aes(x = ExerN)) +
  geom_bar() +
  xlab("Exercise") +
  ylab("")
s6 <- ggplot(survey, aes(x = M.I)) +
  geom_bar() +
  xlab("Height units") +
  ylab("")

# Same idea for Smoke: order the levels Never, Occas, Regul, Heavy.
survey$SmokeN <- factor(
  survey$Smoke,
  levels = c("Never", "Occas", "Regul", "Heavy")
)
s7 <- ggplot(survey, aes(x = SmokeN)) +
  geom_bar() +
  xlab("Smoking") +
  ylab("")

grid.arrange(s1, s2, s3, s4, s5, s6, s7, ncol = 3)

# Rebuild the Sex panel from the rows where Sex is not missing, so its NA bar
# disappears, and draw the grid again.
s1 <- ggplot(survey[!is.na(survey$Sex), ], aes(x = Sex)) +
  geom_bar() +
  ylab("")

grid.arrange(s1, s2, s3, s4, s5, s6, s7, ncol = 3)

summary(survey$W.Hnd)
##  Left Right  NA's 
##    18   218     1
# Goles en futbol

data(UKSoccer, package="vcd")

class(UKSoccer)
## [1] "table"
str(UKSoccer)
##  table [1:5, 1:5] 27 59 28 19 7 29 53 32 14 8 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ Home: chr [1:5] "0" "1" "2" "3" ...
##   ..$ Away: chr [1:5] "0" "1" "2" "3" ...
glimpse(UKSoccer)
##  table [1:5, 1:5] 27 59 28 19 7 29 53 32 14 8 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ Home: chr [1:5] "0" "1" "2" "3" ...
##   ..$ Away: chr [1:5] "0" "1" "2" "3" ...
summary(UKSoccer)
## Number of cases in table: 380 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 18.699, df = 16, p-value = 0.2846
##  Chi-squared approximation may be incorrect
# Convert the Home x Away contingency table to a long data frame
# (columns Home, Away, Freq).
PL <- data.frame(UKSoccer)

# Axis tick labels shared by both charts.
lx <- c("0", "1", "2", "3", "4 or more")

# Goals scored by the home side, weighted by the number of matches.
b1 <- ggplot(PL, aes(x = factor(Home), weight = Freq)) +
  geom_bar(fill = "firebrick1") +
  ylab("") +
  xlab("Home Goals") +
  scale_x_discrete(labels = lx) +
  ylim(0, 150)

# Goals scored by the away side, on the same y-range for comparison.
b2 <- ggplot(PL, aes(x = factor(Away), weight = Freq)) +
  geom_bar(fill = "cyan1") +
  ylab("") +
  xlab("Away Goals") +
  scale_x_discrete(labels = lx) +
  ylim(0, 150)

grid.arrange(b1, b2, nrow = 1)

# Benford's Law

# Digits 1-9 with the value log10(1 + 1/d) for each digit d.
xx <- 1:9
Ben <- data.frame(xx, pdf = log10(1 + 1 / xx))

# Bar heights come from the weight aesthetic; both axis titles are blank.
ggplot(Ben, aes(x = factor(xx), weight = pdf)) +
  geom_bar() +
  xlab("") +
  ylab("") +
  ylim(0, 0.35)

Peso y altura en las olimpiadas

library(VGAMdata)
data(oly12, package="VGAMdata")

class(oly12)
## [1] "data.frame"
str(oly12)
## 'data.frame':    10384 obs. of  14 variables:
##  $ Name   : Factor w/ 10366 levels "Aaron Brown",..: 5353 121 4117 16 6033 5686 6061 6765 2738 3854 ...
##  $ Country: Factor w/ 205 levels "Afghanistan",..: 144 195 68 125 154 68 8 125 94 3 ...
##  $ Age    : int  23 33 30 24 26 27 30 23 27 19 ...
##  $ Height : num  1.7 1.93 1.87 NA 1.78 1.82 1.82 1.87 1.9 1.7 ...
##  $ Weight : int  60 125 76 NA 85 80 73 75 80 NA ...
##  $ Sex    : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 2 2 2 ...
##  $ DOB    : Date, format: "1989-02-06" NA ...
##  $ PlaceOB: Factor w/ 4108 levels "","Aachen (GER)",..: 2486 3302 398 48 3436 1 1 1 1172 2266 ...
##  $ Gold   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Silver : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Bronze : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Total  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sport  : Factor w/ 42 levels "Archery","Athletics",..: 27 2 2 7 2 25 29 22 2 7 ...
##  $ Event  : Factor w/ 763 levels "Group All-Around",..: 350 405 251 443 699 406 726 403 248 491 ...
glimpse(oly12)
## Observations: 10,384
## Variables: 14
## $ Name    <fct> Lamusi A, A G Kruger, Jamale Aarrass, Abdelhak Aatakni...
## $ Country <fct> People's Republic of China, United States of America, ...
## $ Age     <int> 23, 33, 30, 24, 26, 27, 30, 23, 27, 19, 37, 28, 28, 28...
## $ Height  <dbl> 1.70, 1.93, 1.87, NA, 1.78, 1.82, 1.82, 1.87, 1.90, 1....
## $ Weight  <int> 60, 125, 76, NA, 85, 80, 73, 75, 80, NA, NA, NA, 60, 6...
## $ Sex     <fct> M, M, M, M, F, M, F, M, M, M, M, M, F, F, M, F, M, M, ...
## $ DOB     <date> 1989-02-06, NA, NA, 1988-09-02, NA, 1984-06-09, NA, 1...
## $ PlaceOB <fct> NEIMONGGOL (CHN), Sheldon (USA), BEZONS (FRA), AIN SEB...
## $ Gold    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Silver  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Bronze  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Total   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Sport   <fct> Judo, Athletics, Athletics, Boxing, Athletics, Handbal...
## $ Event   <fct> Men's -60kg, Men's Hammer Throw, Men's 1500m, Men's Li...
summary(oly12)
##                      Name                             Country    
##  Lei Zhang             :    3   Great Britain             : 523  
##  Ling Li               :    3   United States of America  : 518  
##  Aleksandar Aleksandrov:    2   Russian Federation        : 414  
##  Hao Wang              :    2   Australia                 : 399  
##  Ivan Zaytsev          :    2   Germany                   : 385  
##  Maria Gromova         :    2   People's Republic of China: 337  
##  (Other)               :10370   (Other)                   :7808  
##       Age            Height          Weight       Sex     
##  Min.   :13.00   Min.   :1.320   Min.   : 36.00   F:4628  
##  1st Qu.:22.00   1st Qu.:1.690   1st Qu.: 61.00   M:5756  
##  Median :25.00   Median :1.770   Median : 70.00           
##  Mean   :26.07   Mean   :1.769   Mean   : 72.85           
##  3rd Qu.:29.00   3rd Qu.:1.850   3rd Qu.: 81.00           
##  Max.   :71.00   Max.   :2.210   Max.   :218.00           
##                  NA's   :561     NA's   :1280             
##       DOB                           PlaceOB          Gold        
##  Min.   :1947-06-01                     :2690   Min.   :0.00000  
##  1st Qu.:1983-01-02   Seoul (KOR)       :  57   1st Qu.:0.00000  
##  Median :1986-09-11   Budapest (HUN)    :  54   Median :0.00000  
##  Mean   :1986-01-09   Moscow (RUS)      :  50   Mean   :0.01666  
##  3rd Qu.:1989-11-10   Shandong (CHN)    :  37   3rd Qu.:0.00000  
##  Max.   :1997-07-09   Buenos Aires (ARG):  31   Max.   :2.00000  
##  NA's   :6192         (Other)           :7465                    
##      Silver            Bronze            Total              Sport     
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Athletics:2119  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   Swimming : 907  
##  Median :0.00000   Median :0.00000   Median :0.0000   Football : 596  
##  Mean   :0.01705   Mean   :0.01849   Mean   :0.0522   Rowing   : 524  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   Hockey   : 416  
##  Max.   :2.00000   Max.   :2.00000   Max.   :5.0000   Judo     : 368  
##                                                       (Other)  :5454  
##               Event     
##  Men's Football  : 336  
##  Women's Football: 260  
##  Women's Hockey  : 210  
##  Men's Hockey    : 206  
##  Men's Handball  : 160  
##  Women's Handball: 159  
##  (Other)         :9053
# Scatterplot of Weight against Height for every row of oly12.
ggplot(oly12, aes(x = Height, y = Weight)) +
  geom_point() +
  ggtitle("Athletes at the London Olympics 2012")
## Warning: Removed 1346 rows containing missing values (geom_point).

¿Los males de la bebida?

library(HistData)
data(DrinksWages, package="HistData")

class(DrinksWages)
## [1] "data.frame"
str(DrinksWages)
## 'data.frame':    70 obs. of  6 variables:
##  $ class : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
##  $ trade : Factor w/ 70 levels "baker","barman",..: 38 10 25 55 36 44 68 34 14 11 ...
##  $ sober : int  1 1 2 1 2 9 8 3 0 12 ...
##  $ drinks: int  1 10 1 5 0 8 2 5 7 23 ...
##  $ wage  : num  24 18.4 21.5 21.2 19 ...
##  $ n     : int  2 11 3 6 2 17 10 8 7 35 ...
glimpse(DrinksWages)
## Observations: 70
## Variables: 6
## $ class  <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A...
## $ trade  <fct> papercutter, cabmen, goldbeater, stablemen, millworker,...
## $ sober  <int> 1, 1, 2, 1, 2, 9, 8, 3, 0, 12, 1, 2, 11, 2, 1, 1, 1, 1,...
## $ drinks <int> 1, 10, 1, 5, 0, 8, 2, 5, 7, 23, 0, 4, 9, 2, 1, 2, 0, 0,...
## $ wage   <dbl> 24.00000, 18.41667, 21.50000, 21.16667, 19.00000, 20.50...
## $ n      <int> 2, 11, 3, 6, 2, 17, 10, 8, 7, 35, 1, 6, 20, 4, 2, 3, 1,...
summary(DrinksWages)
##  class         trade        sober           drinks            wage      
##  A:25   baker     : 1   Min.   : 0.00   Min.   : 0.000   Min.   :12.00  
##  B:22   barman    : 1   1st Qu.: 1.00   1st Qu.: 0.250   1st Qu.:22.83  
##  C:23   billposter: 1   Median : 2.00   Median : 2.000   Median :27.00  
##         blacksmith: 1   Mean   : 3.90   Mean   : 4.729   Mean   :27.03  
##         bookbinder: 1   3rd Qu.: 3.75   3rd Qu.: 5.000   3rd Qu.:31.08  
##         bottler   : 1   Max.   :71.00   Max.   :85.000   Max.   :40.00  
##         (Other)   :64                                                   
##        n          
##  Min.   :  1.000  
##  1st Qu.:  2.000  
##  Median :  3.500  
##  Mean   :  8.629  
##  3rd Qu.:  8.000  
##  Max.   :156.000  
## 
# wage against the proportion drinks / n, with both axes fixed to start at 0.
ggplot(DrinksWages, aes(x = drinks / n, y = wage)) +
  geom_point() +
  xlab("Proportion of drinkers") +
  xlim(0, 1) +
  ylim(0, 40)

# Histogram of n with one unit-wide bin per integer up to max(n).
# with() is kept so the default title and axis label read "n".
with(DrinksWages, hist(n, breaks = 0:max(n)))

with(DrinksWages, table(n))
## n
##   1   2   3   4   5   6   7   8   9  10  11  12  17  19  20  21  22  29 
##  16  10   9   6   2   4   2   4   4   1   1   2   1   1   1   1   1   1 
##  35  37 156 
##   1   1   1
with(DrinksWages, max(n[drinks==0]))
## [1] 5
with(DrinksWages, trade[drinks==0 & n==max(n[drinks==0])]) 
## [1] gasworker
## 70 Levels: baker barman billposter blacksmith bookbinder ... wireworker
with(DrinksWages, max(n[sober==0]))
## [1] 7
with(DrinksWages, trade[sober==0 & n==max(n[sober==0])])
## [1] chimneysweep
## 70 Levels: baker barman billposter blacksmith bookbinder ... wireworker
# Keep only the rows with n greater than 4, then redraw the wage scatterplot
# on the same axis ranges as before.
bigDW <- filter(DrinksWages, n > 4)
ggplot(bigDW, aes(x = drinks / n, y = wage)) +
  geom_point() +
  xlab("Proportion of drinkers") +
  xlim(0, 1) +
  ylim(0, 40)

Old Faithful

data(geyser, package="MASS")

class(geyser)
## [1] "data.frame"
str(geyser)
## 'data.frame':    299 obs. of  2 variables:
##  $ waiting : num  80 71 57 80 75 77 60 86 77 56 ...
##  $ duration: num  4.02 2.15 4 4 4 ...
glimpse(geyser)
## Observations: 299
## Variables: 2
## $ waiting  <dbl> 80, 71, 57, 80, 75, 77, 60, 86, 77, 56, 81, 50, 89, 5...
## $ duration <dbl> 4.016667, 2.150000, 4.000000, 4.000000, 4.000000, 2.0...
summary(geyser)
##     waiting          duration     
##  Min.   : 43.00   Min.   :0.8333  
##  1st Qu.: 59.00   1st Qu.:2.0000  
##  Median : 76.00   Median :4.0000  
##  Mean   : 72.31   Mean   :3.4608  
##  3rd Qu.: 83.00   3rd Qu.:4.3833  
##  Max.   :108.00   Max.   :5.4500
# Scatterplot of waiting against duration.
ggplot(geyser, aes(x = duration, y = waiting)) +
  geom_point()

# The same scatterplot with 2-D density contours added on top of the points.
ggplot(geyser, aes(x = duration, y = waiting)) +
  geom_point() +
  geom_density2d()

library(hdrcde)
## This is hdrcde 3.2
# Single-panel layout with custom margins for the base-graphics plots below.
par(mfrow = c(1, 1), mar = c(3.1, 4.1, 1.1, 2.1))

# Bivariate HDR boxplot of duration and waiting with the points overlaid,
# using four probability levels (see ?hdr.boxplot.2d for how prob is used).
with(
  geyser,
  hdr.boxplot.2d(duration, waiting,
                 show.points = TRUE,
                 prob = c(0.01, 0.05, 0.5, 0.75))
)

# Same plot with a single probability level.
with(
  geyser,
  hdr.boxplot.2d(duration, waiting,
                 show.points = TRUE,
                 prob = 0.01)
)

Coches

data(Cars93, package="MASS")

class(Cars93)
## [1] "data.frame"
str(Cars93)
## 'data.frame':    93 obs. of  27 variables:
##  $ Manufacturer      : Factor w/ 32 levels "Acura","Audi",..: 1 1 2 2 3 4 4 4 4 5 ...
##  $ Model             : Factor w/ 93 levels "100","190E","240",..: 49 56 9 1 6 24 54 74 73 35 ...
##  $ Type              : Factor w/ 6 levels "Compact","Large",..: 4 3 1 3 3 3 2 2 3 2 ...
##  $ Min.Price         : num  12.9 29.2 25.9 30.8 23.7 14.2 19.9 22.6 26.3 33 ...
##  $ Price             : num  15.9 33.9 29.1 37.7 30 15.7 20.8 23.7 26.3 34.7 ...
##  $ Max.Price         : num  18.8 38.7 32.3 44.6 36.2 17.3 21.7 24.9 26.3 36.3 ...
##  $ MPG.city          : int  25 18 20 19 22 22 19 16 19 16 ...
##  $ MPG.highway       : int  31 25 26 26 30 31 28 25 27 25 ...
##  $ AirBags           : Factor w/ 3 levels "Driver & Passenger",..: 3 1 2 1 2 2 2 2 2 2 ...
##  $ DriveTrain        : Factor w/ 3 levels "4WD","Front",..: 2 2 2 2 3 2 2 3 2 2 ...
##  $ Cylinders         : Factor w/ 6 levels "3","4","5","6",..: 2 4 4 4 2 2 4 4 4 5 ...
##  $ EngineSize        : num  1.8 3.2 2.8 2.8 3.5 2.2 3.8 5.7 3.8 4.9 ...
##  $ Horsepower        : int  140 200 172 172 208 110 170 180 170 200 ...
##  $ RPM               : int  6300 5500 5500 5500 5700 5200 4800 4000 4800 4100 ...
##  $ Rev.per.mile      : int  2890 2335 2280 2535 2545 2565 1570 1320 1690 1510 ...
##  $ Man.trans.avail   : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 1 1 1 1 ...
##  $ Fuel.tank.capacity: num  13.2 18 16.9 21.1 21.1 16.4 18 23 18.8 18 ...
##  $ Passengers        : int  5 5 5 6 4 6 6 6 5 6 ...
##  $ Length            : int  177 195 180 193 186 189 200 216 198 206 ...
##  $ Wheelbase         : int  102 115 102 106 109 105 111 116 108 114 ...
##  $ Width             : int  68 71 67 70 69 69 74 78 73 73 ...
##  $ Turn.circle       : int  37 38 37 37 39 41 42 45 41 43 ...
##  $ Rear.seat.room    : num  26.5 30 28 31 27 28 30.5 30.5 26.5 35 ...
##  $ Luggage.room      : int  11 15 14 17 13 16 17 21 14 18 ...
##  $ Weight            : int  2705 3560 3375 3405 3640 2880 3470 4105 3495 3620 ...
##  $ Origin            : Factor w/ 2 levels "USA","non-USA": 2 2 2 2 2 1 1 1 1 1 ...
##  $ Make              : Factor w/ 93 levels "Acura Integra",..: 1 2 4 3 5 6 7 9 8 10 ...
glimpse(Cars93)
## Observations: 93
## Variables: 27
## $ Manufacturer       <fct> Acura, Acura, Audi, Audi, BMW, Buick, Buick...
## $ Model              <fct> Integra, Legend, 90, 100, 535i, Century, Le...
## $ Type               <fct> Small, Midsize, Compact, Midsize, Midsize, ...
## $ Min.Price          <dbl> 12.9, 29.2, 25.9, 30.8, 23.7, 14.2, 19.9, 2...
## $ Price              <dbl> 15.9, 33.9, 29.1, 37.7, 30.0, 15.7, 20.8, 2...
## $ Max.Price          <dbl> 18.8, 38.7, 32.3, 44.6, 36.2, 17.3, 21.7, 2...
## $ MPG.city           <int> 25, 18, 20, 19, 22, 22, 19, 16, 19, 16, 16,...
## $ MPG.highway        <int> 31, 25, 26, 26, 30, 31, 28, 25, 27, 25, 25,...
## $ AirBags            <fct> None, Driver & Passenger, Driver only, Driv...
## $ DriveTrain         <fct> Front, Front, Front, Front, Rear, Front, Fr...
## $ Cylinders          <fct> 4, 6, 6, 6, 4, 4, 6, 6, 6, 8, 8, 4, 4, 6, 4...
## $ EngineSize         <dbl> 1.8, 3.2, 2.8, 2.8, 3.5, 2.2, 3.8, 5.7, 3.8...
## $ Horsepower         <int> 140, 200, 172, 172, 208, 110, 170, 180, 170...
## $ RPM                <int> 6300, 5500, 5500, 5500, 5700, 5200, 4800, 4...
## $ Rev.per.mile       <int> 2890, 2335, 2280, 2535, 2545, 2565, 1570, 1...
## $ Man.trans.avail    <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, No, No...
## $ Fuel.tank.capacity <dbl> 13.2, 18.0, 16.9, 21.1, 21.1, 16.4, 18.0, 2...
## $ Passengers         <int> 5, 5, 5, 6, 4, 6, 6, 6, 5, 6, 5, 5, 5, 4, 6...
## $ Length             <int> 177, 195, 180, 193, 186, 189, 200, 216, 198...
## $ Wheelbase          <int> 102, 115, 102, 106, 109, 105, 111, 116, 108...
## $ Width              <int> 68, 71, 67, 70, 69, 69, 74, 78, 73, 73, 74,...
## $ Turn.circle        <int> 37, 38, 37, 37, 39, 41, 42, 45, 41, 43, 44,...
## $ Rear.seat.room     <dbl> 26.5, 30.0, 28.0, 31.0, 27.0, 28.0, 30.5, 3...
## $ Luggage.room       <int> 11, 15, 14, 17, 13, 16, 17, 21, 14, 18, 14,...
## $ Weight             <int> 2705, 3560, 3375, 3405, 3640, 2880, 3470, 4...
## $ Origin             <fct> non-USA, non-USA, non-USA, non-USA, non-USA...
## $ Make               <fct> Acura Integra, Acura Legend, Audi 90, Audi ...
summary(Cars93)
##     Manufacturer     Model         Type      Min.Price         Price      
##  Chevrolet: 8    100    : 1   Compact:16   Min.   : 6.70   Min.   : 7.40  
##  Ford     : 8    190E   : 1   Large  :11   1st Qu.:10.80   1st Qu.:12.20  
##  Dodge    : 6    240    : 1   Midsize:22   Median :14.70   Median :17.70  
##  Mazda    : 5    300E   : 1   Small  :21   Mean   :17.13   Mean   :19.51  
##  Pontiac  : 5    323    : 1   Sporty :14   3rd Qu.:20.30   3rd Qu.:23.30  
##  Buick    : 4    535i   : 1   Van    : 9   Max.   :45.40   Max.   :61.90  
##  (Other)  :57    (Other):87                                               
##    Max.Price       MPG.city      MPG.highway                  AirBags  
##  Min.   : 7.9   Min.   :15.00   Min.   :20.00   Driver & Passenger:16  
##  1st Qu.:14.7   1st Qu.:18.00   1st Qu.:26.00   Driver only       :43  
##  Median :19.6   Median :21.00   Median :28.00   None              :34  
##  Mean   :21.9   Mean   :22.37   Mean   :29.09                          
##  3rd Qu.:25.3   3rd Qu.:25.00   3rd Qu.:31.00                          
##  Max.   :80.0   Max.   :46.00   Max.   :50.00                          
##                                                                        
##  DriveTrain  Cylinders    EngineSize      Horsepower         RPM      
##  4WD  :10   3     : 3   Min.   :1.000   Min.   : 55.0   Min.   :3800  
##  Front:67   4     :49   1st Qu.:1.800   1st Qu.:103.0   1st Qu.:4800  
##  Rear :16   5     : 2   Median :2.400   Median :140.0   Median :5200  
##             6     :31   Mean   :2.668   Mean   :143.8   Mean   :5281  
##             8     : 7   3rd Qu.:3.300   3rd Qu.:170.0   3rd Qu.:5750  
##             rotary: 1   Max.   :5.700   Max.   :300.0   Max.   :6500  
##                                                                       
##   Rev.per.mile  Man.trans.avail Fuel.tank.capacity   Passengers   
##  Min.   :1320   No :32          Min.   : 9.20      Min.   :2.000  
##  1st Qu.:1985   Yes:61          1st Qu.:14.50      1st Qu.:4.000  
##  Median :2340                   Median :16.40      Median :5.000  
##  Mean   :2332                   Mean   :16.66      Mean   :5.086  
##  3rd Qu.:2565                   3rd Qu.:18.80      3rd Qu.:6.000  
##  Max.   :3755                   Max.   :27.00      Max.   :8.000  
##                                                                   
##      Length        Wheelbase         Width        Turn.circle   
##  Min.   :141.0   Min.   : 90.0   Min.   :60.00   Min.   :32.00  
##  1st Qu.:174.0   1st Qu.: 98.0   1st Qu.:67.00   1st Qu.:37.00  
##  Median :183.0   Median :103.0   Median :69.00   Median :39.00  
##  Mean   :183.2   Mean   :103.9   Mean   :69.38   Mean   :38.96  
##  3rd Qu.:192.0   3rd Qu.:110.0   3rd Qu.:72.00   3rd Qu.:41.00  
##  Max.   :219.0   Max.   :119.0   Max.   :78.00   Max.   :45.00  
##                                                                 
##  Rear.seat.room   Luggage.room       Weight         Origin  
##  Min.   :19.00   Min.   : 6.00   Min.   :1695   USA    :48  
##  1st Qu.:26.00   1st Qu.:12.00   1st Qu.:2620   non-USA:45  
##  Median :27.50   Median :14.00   Median :3040               
##  Mean   :27.83   Mean   :13.89   Mean   :3073               
##  3rd Qu.:30.00   3rd Qu.:15.00   3rd Qu.:3525               
##  Max.   :36.00   Max.   :22.00   Max.   :4105               
##  NA's   :2       NA's   :11                                 
##             Make   
##  Acura Integra: 1  
##  Acura Legend : 1  
##  Audi 100     : 1  
##  Audi 90      : 1  
##  BMW 535i     : 1  
##  Buick Century: 1  
##  (Other)      :87
# MPG.city against Weight, with a green smoother (default method) on top and
# the y-axis fixed to 0-50.
ggplot(Cars93, aes(x = Weight, y = MPG.city)) +
  geom_point() +
  geom_smooth(colour = "green") +
  ylim(0, 50)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Alturas de Pearson

data(father.son, package="UsingR")

class(father.son)
## [1] "data.frame"
str(father.son)
## 'data.frame':    1078 obs. of  2 variables:
##  $ fheight: num  65 63.3 65 65.8 61.1 ...
##  $ sheight: num  59.8 63.2 63.3 62.8 64.3 ...
glimpse(father.son)
## Observations: 1,078
## Variables: 2
## $ fheight <dbl> 65.04851, 63.25094, 64.95532, 65.75250, 61.13723, 63.0...
## $ sheight <dbl> 59.77827, 63.21404, 63.34242, 62.79238, 64.28113, 64.2...
summary(father.son)
##     fheight         sheight     
##  Min.   :59.01   Min.   :58.51  
##  1st Qu.:65.79   1st Qu.:66.93  
##  Median :67.77   Median :68.62  
##  Mean   :67.69   Mean   :68.68  
##  3rd Qu.:69.60   3rd Qu.:70.47  
##  Max.   :75.43   Max.   :78.36
# Son's height against father's height with a red least-squares line and the
# identity line y = x (slope 1, intercept 0) for reference.
ggplot(father.son, aes(x = fheight, y = sheight)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red") +
  geom_abline(slope = 1, intercept = 0)

# Load the data and fit a simple linear regression of sheight on fheight.
data(father.son, package = "UsingR")
m1 <- lm(formula = sheight ~ fheight, data = father.son)
summary(m1)
## 
## Call:
## lm(formula = sheight ~ fheight, data = father.son)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8772 -1.5144 -0.0079  1.6285  8.9685 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.88660    1.83235   18.49   <2e-16 ***
## fheight      0.51409    0.02705   19.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.437 on 1076 degrees of freedom
## Multiple R-squared:  0.2513, Adjusted R-squared:  0.2506 
## F-statistic: 361.2 on 1 and 1076 DF,  p-value: < 2.2e-16
# 2 x 2 panel layout so the diagnostic plots drawn by plot(m1) share one page.
par(mfrow = c(2, 2))
plot(m1)

# Compare the straight-line fit (red, no confidence band) with the default
# smoother on the same scatterplot.
data(father.son, package = "UsingR")
ggplot(father.son, aes(x = fheight, y = sheight)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Atletas olímpicos

# Weight against Height with small points, one panel per Sex stacked in a
# single column.
ggplot(oly12, aes(x = Height, y = Weight)) +
  geom_point(size = 1) +
  facet_wrap(~Sex, ncol = 1)
## Warning: Removed 1346 rows containing missing values (geom_point).

# Copy of oly12 whose Sport values are abbreviated (minimum length 12) so the
# facet strip labels stay short.
oly12S <- transform(oly12, Sport = abbreviate(Sport, 12))

# One Weight-vs-Height panel per sport.
ggplot(oly12S, aes(x = Height, y = Weight)) +
  geom_point(size = 1) +
  facet_wrap(~Sport) +
  ggtitle("Weight and Height by Sport")
## Warning: Removed 1346 rows containing missing values (geom_point).

# Restrict to three sports and draw one Weight-vs-Height panel for each.
oly12JWW <- filter(
  oly12,
  Sport %in% c("Judo", "Weightlifting", "Wrestling")
)
ggplot(oly12JWW, aes(x = Height, y = Weight)) +
  geom_point(size = 1) +
  facet_wrap(~Sport) +
  ggtitle("Weight and Height by Sport")
## Warning: Removed 69 rows containing missing values (geom_point).

Crímenes

library(GGally)
library(VGAMdata)
data(crime.us, package="VGAMdata")

class(crime.us)
## [1] "data.frame"
str(crime.us)
## 'data.frame':    50 obs. of  22 variables:
##  $ State                : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Population           : int  4708708 698473 6595778 2889450 36961664 5024748 3518288 885122 18537969 9829211 ...
##  $ ViolentCrimeTotal    : int  21179 4421 26929 14959 174459 16976 10508 5635 113541 41880 ...
##  $ Murder               : int  323 22 354 179 1972 175 107 41 1017 566 ...
##  $ Rape                 : int  1504 512 2110 1368 8713 2242 651 338 5501 2301 ...
##  $ Robbery              : int  6259 655 8099 2582 64093 3387 3990 1671 30911 14603 ...
##  $ Assault              : int  13093 3232 16366 10830 99681 11172 5760 3585 76112 24410 ...
##  $ PropertyCrimeTotal   : int  177629 20577 234582 109038 1009614 133968 82181 29648 712010 360400 ...
##  $ Burglary             : int  48837 3597 53412 34764 230137 26649 15073 6932 181884 98362 ...
##  $ LarcenyTheft         : int  117711 15291 155184 68171 615456 94861 59632 20809 479867 228893 ...
##  $ MotorVehicleTheft    : int  11081 1689 25986 6103 164021 12458 7476 1907 50259 33145 ...
##  $ ViolentCrimeRate     : num  450 633 408 518 472 ...
##  $ MurderRate           : num  6.9 3.1 5.4 6.2 5.3 3.5 3 4.6 5.5 5.8 ...
##  $ RapeRate             : num  31.9 73.3 32 47.3 23.6 44.6 18.5 38.2 29.7 23.4 ...
##  $ RobberyRate          : num  132.9 93.8 122.8 89.4 173.4 ...
##  $ AssaultRate          : num  278 463 248 375 270 ...
##  $ PropertyCrimeRate    : num  3772 2946 3556 3774 2732 ...
##  $ BurglaryRate         : num  1037 515 810 1203 623 ...
##  $ LarcenyTheftRate     : num  2500 2189 2353 2359 1665 ...
##  $ MotorVehicleTheftRate: num  235 242 394 211 444 ...
##  $ stateNumber          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ abbrev               : chr  "AL" "AK" "AZ" "AR" ...
glimpse(crime.us)
## Observations: 50
## Variables: 22
## $ State                 <chr> "Alabama", "Alaska", "Arizona", "Arkansa...
## $ Population            <int> 4708708, 698473, 6595778, 2889450, 36961...
## $ ViolentCrimeTotal     <int> 21179, 4421, 26929, 14959, 174459, 16976...
## $ Murder                <int> 323, 22, 354, 179, 1972, 175, 107, 41, 1...
## $ Rape                  <int> 1504, 512, 2110, 1368, 8713, 2242, 651, ...
## $ Robbery               <int> 6259, 655, 8099, 2582, 64093, 3387, 3990...
## $ Assault               <int> 13093, 3232, 16366, 10830, 99681, 11172,...
## $ PropertyCrimeTotal    <int> 177629, 20577, 234582, 109038, 1009614, ...
## $ Burglary              <int> 48837, 3597, 53412, 34764, 230137, 26649...
## $ LarcenyTheft          <int> 117711, 15291, 155184, 68171, 615456, 94...
## $ MotorVehicleTheft     <int> 11081, 1689, 25986, 6103, 164021, 12458,...
## $ ViolentCrimeRate      <dbl> 449.8, 633.0, 408.3, 517.7, 472.0, 337.8...
## $ MurderRate            <dbl> 6.9, 3.1, 5.4, 6.2, 5.3, 3.5, 3.0, 4.6, ...
## $ RapeRate              <dbl> 31.9, 73.3, 32.0, 47.3, 23.6, 44.6, 18.5...
## $ RobberyRate           <dbl> 132.9, 93.8, 122.8, 89.4, 173.4, 67.4, 1...
## $ AssaultRate           <dbl> 278.1, 462.7, 248.1, 374.8, 269.7, 222.3...
## $ PropertyCrimeRate     <dbl> 3772.4, 2946.0, 3556.5, 3773.7, 2731.5, ...
## $ BurglaryRate          <dbl> 1037.2, 515.0, 809.8, 1203.1, 622.6, 530...
## $ LarcenyTheftRate      <dbl> 2499.9, 2189.2, 2352.8, 2359.3, 1665.1, ...
## $ MotorVehicleTheftRate <dbl> 235.3, 241.8, 394.0, 211.2, 443.8, 247.9...
## $ stateNumber           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1...
## $ abbrev                <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT"...
summary(crime.us)
##     State             Population       ViolentCrimeTotal     Murder       
##  Length:50          Min.   :  544270   Min.   :   817    Min.   :   7.00  
##  Class :character   1st Qu.: 1802408   1st Qu.:  5456    1st Qu.:  37.75  
##  Mode  :character   Median : 4403094   Median : 15968    Median : 176.50  
##                     Mean   : 6128138   Mean   : 26207    Mean   : 301.94  
##                     3rd Qu.: 6647091   3rd Qu.: 30481    3rd Qu.: 424.25  
##                     Max.   :36961664   Max.   :174459    Max.   :1972.00  
##       Rape           Robbery         Assault      PropertyCrimeTotal
##  Min.   : 124.0   Min.   :   77   Min.   :  575   Min.   :  12502   
##  1st Qu.: 562.8   1st Qu.: 1201   1st Qu.: 3610   1st Qu.:  47968   
##  Median :1263.5   Median : 3810   Median :10297   Median : 132868   
##  Mean   :1758.9   Mean   : 8077   Mean   :16069   Mean   : 185850   
##  3rd Qu.:2080.8   3rd Qu.: 9260   3rd Qu.:20017   3rd Qu.: 226611   
##  Max.   :8713.0   Max.   :64093   Max.   :99681   Max.   :1009614   
##     Burglary       LarcenyTheft    MotorVehicleTheft ViolentCrimeRate
##  Min.   :  2230   Min.   :  9296   Min.   :   448    Min.   :119.8   
##  1st Qu.:  9871   1st Qu.: 34424   1st Qu.:  3583    1st Qu.:255.3   
##  Median : 29432   Median : 89563   Median : 10136    Median :335.5   
##  Mean   : 43909   Mean   :126160   Mean   : 15782    Mean   :382.0   
##  3rd Qu.: 51821   3rd Qu.:153502   3rd Qu.: 17736    3rd Qu.:495.7   
##  Max.   :240233   Max.   :678353   Max.   :164021    Max.   :702.2   
##    MurderRate        RapeRate      RobberyRate      AssaultRate   
##  Min.   : 0.800   Min.   :12.00   Min.   : 13.70   Min.   : 59.0  
##  1st Qu.: 2.525   1st Qu.:26.05   1st Qu.: 65.08   1st Qu.:151.2  
##  Median : 4.300   Median :31.10   Median : 97.10   Median :213.8  
##  Mean   : 4.276   Mean   :32.08   Mean   :100.15   Mean   :245.5  
##  3rd Qu.: 5.875   3rd Qu.:35.23   3rd Qu.:135.35   3rd Qu.:320.3  
##  Max.   :11.800   Max.   :73.30   Max.   :227.80   Max.   :503.4  
##  PropertyCrimeRate  BurglaryRate    LarcenyTheftRate MotorVehicleTheftRate
##  Min.   :1719      Min.   : 305.2   Min.   :1314     Min.   : 72.1        
##  1st Qu.:2438      1st Qu.: 484.1   1st Qu.:1728     1st Qu.:149.6        
##  Median :2892      Median : 636.1   Median :2039     Median :213.0        
##  Mean   :2941      Mean   : 678.6   Mean   :2039     Mean   :222.9        
##  3rd Qu.:3569      3rd Qu.: 885.2   3rd Qu.:2345     3rd Qu.:279.4        
##  Max.   :4016      Max.   :1203.1   Max.   :2737     Max.   :468.4        
##   stateNumber       abbrev         
##  Min.   : 1.00   Length:50         
##  1st Qu.:13.25   Class :character  
##  Median :25.50   Mode  :character  
##  Mean   :25.50                     
##  3rd Qu.:37.75                     
##  Max.   :50.00
# Scatterplot matrix of the per-capita crime rates in crime.us.
# Work on a copy so the column names of crime.us itself stay untouched.
crime.usR <- crime.us
# Strip the trailing "Rate" suffix so the panel labels stay short. The pattern
# is anchored at the end of the name; the previous "*Rate" began with a
# dangling quantifier and only worked by accident of the regex engine.
names(crime.usR) <- sub("Rate$", "", names(crime.usR))
# Columns 19 and 20 are presumably the larceny-theft and motor-vehicle-theft
# rates (TODO confirm against names(crime.us)); give them shorter labels.
names(crime.usR)[19:20] <- c("Larceny", "MotorVTheft")
# The list key must be spelled `continuous`; a misspelled key is not
# recognised, so the loess panels would not be drawn in the upper triangle.
ggpairs(crime.usR[, c(13:16, 18:20)],
        title = "Crime rates in the USA",
        upper = list(continuous = "smooth_loess"),
        diag = list(continuous = "densityDiag"), axisLabels = "none")

Otro tipo de Sploms (Scatter Plot Matrix)

# niveles de correlación por color
# Check what kind of object USJudgeRatings is before plotting it.
class(USJudgeRatings)
## [1] "data.frame"
# List every column with its type and its first few values.
str(USJudgeRatings)
## 'data.frame':    43 obs. of  12 variables:
##  $ CONT: num  5.7 6.8 7.2 6.8 7.3 6.2 10.6 7 7.3 8.2 ...
##  $ INTG: num  7.9 8.9 8.1 8.8 6.4 8.8 9 5.9 8.9 7.9 ...
##  $ DMNR: num  7.7 8.8 7.8 8.5 4.3 8.7 8.9 4.9 8.9 6.7 ...
##  $ DILG: num  7.3 8.5 7.8 8.8 6.5 8.5 8.7 5.1 8.7 8.1 ...
##  $ CFMG: num  7.1 7.8 7.5 8.3 6 7.9 8.5 5.4 8.6 7.9 ...
##  $ DECI: num  7.4 8.1 7.6 8.5 6.2 8 8.5 5.9 8.5 8 ...
##  $ PREP: num  7.1 8 7.5 8.7 5.7 8.1 8.5 4.8 8.4 7.9 ...
##  $ FAMI: num  7.1 8 7.5 8.7 5.7 8 8.5 5.1 8.4 8.1 ...
##  $ ORAL: num  7.1 7.8 7.3 8.4 5.1 8 8.6 4.7 8.4 7.7 ...
##  $ WRIT: num  7 7.9 7.4 8.5 5.3 8 8.4 4.9 8.5 7.8 ...
##  $ PHYS: num  8.3 8.5 7.9 8.8 5.5 8.6 9.1 6.8 8.8 8.5 ...
##  $ RTEN: num  7.8 8.7 7.8 8.7 4.8 8.6 9 5 8.8 7.9 ...
# Compact preview of USJudgeRatings: one line per column with type and values.
glimpse(USJudgeRatings)
## Observations: 43
## Variables: 12
## $ CONT <dbl> 5.7, 6.8, 7.2, 6.8, 7.3, 6.2, 10.6, 7.0, 7.3, 8.2, 7.0, 6...
## $ INTG <dbl> 7.9, 8.9, 8.1, 8.8, 6.4, 8.8, 9.0, 5.9, 8.9, 7.9, 8.0, 8....
## $ DMNR <dbl> 7.7, 8.8, 7.8, 8.5, 4.3, 8.7, 8.9, 4.9, 8.9, 6.7, 7.6, 7....
## $ DILG <dbl> 7.3, 8.5, 7.8, 8.8, 6.5, 8.5, 8.7, 5.1, 8.7, 8.1, 7.4, 7....
## $ CFMG <dbl> 7.1, 7.8, 7.5, 8.3, 6.0, 7.9, 8.5, 5.4, 8.6, 7.9, 7.3, 7....
## $ DECI <dbl> 7.4, 8.1, 7.6, 8.5, 6.2, 8.0, 8.5, 5.9, 8.5, 8.0, 7.5, 7....
## $ PREP <dbl> 7.1, 8.0, 7.5, 8.7, 5.7, 8.1, 8.5, 4.8, 8.4, 7.9, 7.1, 6....
## $ FAMI <dbl> 7.1, 8.0, 7.5, 8.7, 5.7, 8.0, 8.5, 5.1, 8.4, 8.1, 7.2, 7....
## $ ORAL <dbl> 7.1, 7.8, 7.3, 8.4, 5.1, 8.0, 8.6, 4.7, 8.4, 7.7, 7.1, 7....
## $ WRIT <dbl> 7.0, 7.9, 7.4, 8.5, 5.3, 8.0, 8.4, 4.9, 8.5, 7.8, 7.2, 7....
## $ PHYS <dbl> 8.3, 8.5, 7.9, 8.8, 5.5, 8.6, 9.1, 6.8, 8.8, 8.5, 8.4, 6....
## $ RTEN <dbl> 7.8, 8.7, 7.8, 8.7, 4.8, 8.6, 9.0, 5.0, 8.8, 7.9, 7.7, 7....
# Per-column summary statistics (min, quartiles, mean, max) of the ratings.
summary(USJudgeRatings)
##       CONT             INTG            DMNR            DILG      
##  Min.   : 5.700   Min.   :5.900   Min.   :4.300   Min.   :5.100  
##  1st Qu.: 6.850   1st Qu.:7.550   1st Qu.:6.900   1st Qu.:7.150  
##  Median : 7.300   Median :8.100   Median :7.700   Median :7.800  
##  Mean   : 7.437   Mean   :8.021   Mean   :7.516   Mean   :7.693  
##  3rd Qu.: 7.900   3rd Qu.:8.550   3rd Qu.:8.350   3rd Qu.:8.450  
##  Max.   :10.600   Max.   :9.200   Max.   :9.000   Max.   :9.000  
##       CFMG            DECI            PREP            FAMI      
##  Min.   :5.400   Min.   :5.700   Min.   :4.800   Min.   :5.100  
##  1st Qu.:7.000   1st Qu.:7.100   1st Qu.:6.900   1st Qu.:6.950  
##  Median :7.600   Median :7.700   Median :7.700   Median :7.600  
##  Mean   :7.479   Mean   :7.565   Mean   :7.467   Mean   :7.488  
##  3rd Qu.:8.050   3rd Qu.:8.150   3rd Qu.:8.200   3rd Qu.:8.250  
##  Max.   :8.700   Max.   :8.800   Max.   :9.100   Max.   :9.100  
##       ORAL            WRIT            PHYS            RTEN      
##  Min.   :4.700   Min.   :4.900   Min.   :4.700   Min.   :4.800  
##  1st Qu.:6.850   1st Qu.:6.900   1st Qu.:7.700   1st Qu.:7.150  
##  Median :7.500   Median :7.600   Median :8.100   Median :7.800  
##  Mean   :7.293   Mean   :7.384   Mean   :7.935   Mean   :7.602  
##  3rd Qu.:8.000   3rd Qu.:8.050   3rd Qu.:8.500   3rd Qu.:8.250  
##  Max.   :8.900   Max.   :9.000   Max.   :9.100   Max.   :9.200
library(gclus)
## Loading required package: cluster
# Scatterplot matrix of the judge ratings with coloured panels: compute the
# correlation matrix, turn it into panel colours with dmat.color(), and hand
# those colours to cpairs().
judge.cor <- cor(x = USJudgeRatings)
judge.color <- dmat.color(judge.cor)
cpairs(
  USJudgeRatings,
  panel.colors = judge.color,
  pch = ".",
  gap = 0.5
)

library(gpairs) 
# Load the Leaves data set and draw a generalised pairs plot of Leaves[1:10],
# asking for loess smoothers in the lower-triangle scatterplots.
data("Leaves")
gpairs(
  Leaves[1:10],
  lower.pars = list(scatter = "loess")
)

# Parallel coordinate plot
 
library(GGally)
library(ggplot2)
# Parallel coordinate plot of iris columns 1 to 4, grouped by Species.
ggparcoord(
  data = iris,
  columns = 1:4,
  groupColumn = "Species"
)

# Copy iris and abbreviate the names of its four measurement columns so the
# axis labels of the plots below stay short; the Species column keeps its name.
iris1 <- iris
names(iris1)[1:4] <- abbreviate(names(iris)[1:4])

summary(iris1)
##       Sp.L            Sp.W            Pt.L            Pt.W      
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
# Four parallel coordinate plots of iris1 that differ only in how the axes are
# scaled: a1 uses ggparcoord()'s default scaling, a2 "uniminmax",
# a3 "globalminmax", and a4 "center" with the median as the scale summary.
# Axis titles are blanked and each plot is titled with its own name.
a1 <- ggparcoord(
  data = iris1, columns = 1:4,
  alphaLines = 0.7, groupColumn = "Species"
) +
  labs(x = "", y = "", title = "a1")

a2 <- ggparcoord(
  data = iris1, columns = 1:4, scale = "uniminmax",
  alphaLines = 0.7, groupColumn = "Species"
) +
  labs(x = "", y = "", title = "a2")

a3 <- ggparcoord(
  data = iris1, columns = 1:4, scale = "globalminmax",
  alphaLines = 0.7, groupColumn = "Species"
) +
  labs(x = "", y = "", title = "a3")

a4 <- ggparcoord(
  data = iris1, columns = 1:4, scale = "center", scaleSummary = "median",
  alphaLines = 0.7, groupColumn = "Species"
) +
  labs(x = "", y = "", title = "a4")

# Show the four variants together in one figure.
gridExtra::grid.arrange(a1, a2, a3, a4)

# Mosaico

# Mosaic plots of Titanic survival, adding one more variable in each panel.
# as.data.frame() turns the Titanic table into long format with a Freq column.
titanic <- as.data.frame(Titanic)
# Use a 2 x 2 layout with small top/right margins for these four plots only.
# par() returns the previous settings, which are restored below so the layout
# does not leak into later base-graphics figures.
old_par <- par(mfrow = c(2, 2), mar = c(4, 4, 0.1, 0.1))
mosaicplot(xtabs(Freq ~ Survived, data = titanic), main = "")
mosaicplot(xtabs(Freq ~ Survived + Sex, data = titanic), main = "")
mosaicplot(xtabs(Freq ~ Survived + Sex + Class, data = titanic), main = "")
mosaicplot(xtabs(Freq ~ Survived + Sex + Class + Age, data = titanic),
           main = "")
par(old_par)

# Pairs display of the table rebuilt from all non-Freq columns of titanic.
pairs(
  xtabs(Freq ~ ., data = titanic)
)

# Bar chart of the Titanic counts: Survived on the x axis, Freq as bar height,
# one facet row per Class and one facet column per Sex/Age combination.
# The legend is hidden.
ggplot(
  data = titanic,
  mapping = aes(x = Survived, y = Freq, fill = Sex)
) +
  geom_bar(stat = "identity") +
  facet_grid(Class ~ Sex + Age) +
  theme(legend.position = "none")

## Note: this call uses Titanic (the original table), not titanic (the
## data frame built from it)!
extracat::fluctile(Titanic)

# extracat::rmb() plot of the long-format titanic data for the variables
# Sex, Class, Age and Survived, in that order.
extracat::rmb(
  formula = ~ Sex + Class + Age + Survived,
  data = titanic,
  cat.ord = 2,
  spine = TRUE,
  freq.trans = "const"
)

# UCB Admissions
    
library(vcd)
## Loading required package: grid
# Convert the UCBAdmissions table to long format: one row per
# Admit/Gender/Dept combination, with its count in Freq.
ucb <- as.data.frame(UCBAdmissions)

# Confirm the container type, then list the columns and their types.
class(ucb)
## [1] "data.frame"
str(ucb)
## 'data.frame':    24 obs. of  4 variables:
##  $ Admit : Factor w/ 2 levels "Admitted","Rejected": 1 2 1 2 1 2 1 2 1 2 ...
##  $ Gender: Factor w/ 2 levels "Male","Female": 1 1 2 2 1 1 2 2 1 1 ...
##  $ Dept  : Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ Freq  : num  512 313 89 19 353 207 17 8 120 205 ...
# Compact preview of ucb: one line per column with its type and first values.
glimpse(ucb)
## Observations: 24
## Variables: 4
## $ Admit  <fct> Admitted, Rejected, Admitted, Rejected, Admitted, Rejec...
## $ Gender <fct> Male, Male, Female, Female, Male, Male, Female, Female,...
## $ Dept   <fct> A, A, A, A, B, B, B, B, C, C, C, C, D, D, D, D, E, E, E...
## $ Freq   <dbl> 512, 313, 89, 19, 353, 207, 17, 8, 120, 205, 202, 391, ...
# Level counts for the factor columns and summary statistics for Freq.
summary(ucb)
##       Admit       Gender   Dept       Freq      
##  Admitted:12   Male  :12   A:4   Min.   :  8.0  
##  Rejected:12   Female:12   B:4   1st Qu.: 80.0  
##                            C:4   Median :170.0  
##                            D:4   Mean   :188.6  
##                            E:4   3rd Qu.:302.5  
##                            F:4   Max.   :512.0
# Add Accept, a copy of Admit with the level order reversed so that
# "Rejected" comes first and "Admitted" second.
ucb$Accept <- factor(ucb$Admit, levels = c("Rejected", "Admitted"))
# Doubledecker plot of Accept by Dept and Gender; the two fill colours are
# given in the same order as the Accept levels.
doubledecker(
  xtabs(Freq ~ Dept + Gender + Accept, data = ucb),
  gp = gpar(fill = c("grey90", "steelblue"))
)